import pandas as pd
#!pip install tqdm
from tqdm import tqdm #progress bar
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import STL
from sklearn.linear_model import LinearRegression
import numpy as np
from scipy.stats import pearsonr
from scipy import stats
import requests
import json
import time
import matplotlib.dates as mdates
excel_file_path = r"E:\Columbia University\APAN Term3\Capstone\RC Projection data since 2022 28AUG24.xlsx"
excel_file = pd.ExcelFile(excel_file_path)
sheets_to_load = ['2022', '2023', '2024', 'Workday Calendar', 'Location hierarchy', 'Fleet Count']
dfs = {}
for sheet in tqdm(sheets_to_load, desc="Loading sheets"):
dfs[sheet] = pd.read_excel(excel_file, sheet_name=sheet)
df_2022 = dfs['2022']
df_2023 = dfs['2023']
df_2024 = dfs['2024']
workday_calendar = dfs['Workday Calendar']
location_df = dfs['Location hierarchy']
fleet_count = dfs['Fleet Count']
Loading sheets: 100%|████████████████████████████████████████████████████████████████████| 6/6 [02:15<00:00, 22.53s/it]
df_total = pd.concat([df_2022, df_2023, df_2024], axis=0, ignore_index=True)
df_cleaned = df_total.loc[:, ~df_total.columns.str.contains('^Unnamed')]
workday_months = workday_calendar[['MONTH', 'WD_IN_MNTH', 'DAYS_IN_MNTH', 'SATURDAY_COUNT', 'WEEKDAY_HOLIDAYS']]
workday_months = workday_months.dropna(how='all')
workday_days = workday_calendar[['MONTH.1', 'DT', 'DT_KEY', 'WD_IN_MNTH.1', 'DAYS_IN_MNTH.1', 'SATURDAY_COUNT.1', 'WEEKDAY_HOLIDAYS.1',
'Saturdays in Month', 'Workdays in Month', 'Max Workday', 'Max Saturday',
'WD Remaining', 'Saturday Remaining']]
# Group the data by MONTH and calculate the sum of TRANSACTION_AMOUNT for each month
monthly_data = df_cleaned.groupby('MONTH')['TRANSACTION_AMOUNT'].sum().reset_index()
# Plotting the monthly transaction amounts
plt.figure(figsize=(12, 6))
plt.plot(monthly_data['MONTH'], monthly_data['TRANSACTION_AMOUNT'], marker='o', linestyle='-')
plt.title('Monthly Transaction Amounts (Historical Data)')
plt.xlabel('Month')
plt.ylabel('Total Transaction Amount')
plt.xticks(rotation=45)
plt.grid()
plt.tight_layout()
# Display the plot
plt.show()
# Group the data by YEAR and MONTH to calculate the sum of TRANSACTION_AMOUNT for each month
monthly_data_by_year = df_cleaned.groupby(['YEAR', 'MONTH'])['TRANSACTION_AMOUNT'].sum().reset_index()
# Create a plot for each year
plt.figure(figsize=(15, 10))
# Plot for 2022
plt.subplot(3, 1, 1) # 3 rows, 1 column, first subplot
data_2022 = monthly_data_by_year[monthly_data_by_year['YEAR'] == '2022-01-01']
plt.plot(data_2022['MONTH'], data_2022['TRANSACTION_AMOUNT'], marker='o', linestyle='-', color='b')
plt.title('Monthly Transaction Amounts for 2022')
plt.xlabel('Month')
plt.ylabel('Total Transaction Amount')
plt.xticks(rotation=45)
plt.grid()
# Plot for 2023
plt.subplot(3, 1, 2) # 3 rows, 1 column, second subplot
data_2023 = monthly_data_by_year[monthly_data_by_year['YEAR'] == '2023-01-01']
plt.plot(data_2023['MONTH'], data_2023['TRANSACTION_AMOUNT'], marker='o', linestyle='-', color='g')
plt.title('Monthly Transaction Amounts for 2023')
plt.xlabel('Month')
plt.ylabel('Total Transaction Amount')
plt.xticks(rotation=45)
plt.grid()
# Plot for 2024
plt.subplot(3, 1, 3) # 3 rows, 1 column, third subplot
data_2024 = monthly_data_by_year[monthly_data_by_year['YEAR'] == '2024-01-01']
plt.plot(data_2024['MONTH'], data_2024['TRANSACTION_AMOUNT'], marker='o', linestyle='-', color='r')
plt.title('Monthly Transaction Amounts for 2024')
plt.xlabel('Month')
plt.ylabel('Total Transaction Amount')
plt.xticks(rotation=45)
plt.grid()
# Adjust layout for better spacing
plt.tight_layout()
# Display the plot
plt.show()
# Daily Historical Cost for each Month
plt.figure(figsize=(20, 15))
for month in range(1, 13): # 1 to 12 for 2022
month_data_22 = df_2022[df_2022['MONTH'].dt.month == month]
daily_cost_22 = month_data_22.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
plt.subplot(3, 4, month) # 3 rows, 4 columns, month plot
plt.plot(daily_cost_22['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_22['TRANSACTION_AMOUNT'], linestyle='-', color='b', linewidth=2) # Adjusted line width
plt.title(f'Daily Costs for 2022-{month:02d}')
plt.xlabel('Date')
plt.ylabel('Total Daily Cost')
# Format x-ticks to show only the date (day and month)
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
plt.xticks(rotation=45) # Rotate x-ticks for better visibility
plt.grid()
plt.tight_layout()
plt.show()
# Create a plot for each month of 2022
plt.figure(figsize=(20, 15))
for month in range(1, 13): # 1 to 12 for 2022
month_data_22 = df_2022[df_2022['MONTH'].dt.month == month]
daily_cost_22 = month_data_22.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
daily_cost_22['CUMULATIVE_TRANSACTION_COST'] = daily_cost_22['TRANSACTION_AMOUNT'].cumsum()
plt.subplot(3, 4, month) # 3 rows, 4 columns, month plot
plt.plot(daily_cost_22['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_22['CUMULATIVE_TRANSACTION_COST'],
linestyle='-', color='b', linewidth=2)
plt.title(f'Cumulative Costs for 2022-{month:02d}')
plt.xlabel('Date')
plt.ylabel('Cumulative Total Cost')
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d')) # Show only day and month
plt.xticks(rotation=45) # Rotate x-ticks for better visibility
plt.grid()
plt.tight_layout()
plt.show()
# Create a plot for each month of 2022
plt.figure(figsize=(20, 15))
for month in range(1, 13): # 1 to 12 for 2022
month_data_22 = df_2022[df_2022['MONTH'].dt.month == month]
# Count occurrences of FIN_SOURCE_TYPE_DESC
fin_source_counts = month_data_22['FIN_SOURCE_TYPE_DESC'].value_counts().head(5) # Get top 5 types
plt.subplot(3, 4, month) # 3 rows, 4 columns, month plot
fin_source_counts.plot(kind='bar', color='gray') # Bar plot for counts
plt.title(f'Top 5 FIN_SOURCE_TYPE_DESC for 2022-{month:02d}')
plt.xlabel('Financial Source Type')
plt.ylabel('Count')
#plt.xticks(rotation=15) # Rotate x-ticks for better visibility
plt.grid(axis='y')
plt.tight_layout()
plt.show()
# Daily Historical Cost for each Month in 2023
plt.figure(figsize=(20, 15))
for month in range(1, 13): # 1 to 12 for 2023
month_data_23 = df_2023[df_2023['MONTH'].dt.month == month]
daily_cost_23 = month_data_23.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
plt.subplot(3, 4, month) # 3 rows, 4 columns, month plot
plt.plot(daily_cost_23['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_23['TRANSACTION_AMOUNT'], linestyle='-', color='orange', linewidth=1) # Adjusted line width
plt.title(f'Daily Costs for 2023-{month:02d}')
plt.xlabel('Date')
plt.ylabel('Total Daily Cost')
# Format x-ticks to show only the date (day and month)
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
plt.xticks(rotation=45) # Rotate x-ticks for better visibility
plt.grid()
plt.tight_layout()
plt.show()
# Create a plot for each month of 2023
plt.figure(figsize=(20, 15))
for month in range(1, 13): # 1 to 12 for 2023
month_data_23 = df_2023[df_2023['MONTH'].dt.month == month]
daily_cost_23 = month_data_23.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
daily_cost_23['CUMULATIVE_TRANSACTION_COST'] = daily_cost_23['TRANSACTION_AMOUNT'].cumsum()
plt.subplot(3, 4, month) # 3 rows, 4 columns, month plot
plt.plot(daily_cost_23['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_23['CUMULATIVE_TRANSACTION_COST'],
linestyle='-', color='orange', linewidth=1)
plt.title(f'Cumulative Costs for 2023-{month:02d}')
plt.xlabel('Date')
plt.ylabel('Cumulative Total Cost')
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d')) # Show only day and month
plt.xticks(rotation=45) # Rotate x-ticks for better visibility
plt.grid()
plt.tight_layout()
plt.show()
补一张Top 5 FIN_SOURCE_TYPE_DESC for 2023 (和上面2022 一样)
# Daily Historical Cost for each Month in 2024
plt.figure(figsize=(20, 15))
for month in range(1, 9): # 1 to 12 for 2024
month_data_24 = df_2024[df_2024['MONTH'].dt.month == month]
daily_cost_24 = month_data_24.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
plt.subplot(2, 4, month) # 3 rows, 4 columns, month plot
plt.plot(daily_cost_24['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_24['TRANSACTION_AMOUNT'], linestyle='-', color='pink', linewidth=2) # Adjusted line width
plt.title(f'Daily Costs for 2024-{month:02d}')
plt.xlabel('Date')
plt.ylabel('Total Daily Cost')
# Format x-ticks to show only the date (day and month)
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
plt.xticks(rotation=45) # Rotate x-ticks for better visibility
plt.grid()
plt.tight_layout()
plt.show()
# Daily Historical Cost for each Month in 2024
plt.figure(figsize=(20, 15))
for month in range(1, 9): # 1 to 12 for 2024
month_data_24 = df_2024[df_2024['MONTH'].dt.month == month]
daily_cost_24 = month_data_24.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
daily_cost_24['CUMULATIVE_TRANSACTION_COST'] = daily_cost_24['TRANSACTION_AMOUNT'].cumsum()
plt.subplot(2, 4, month) # 3 rows, 4 columns, month plot
plt.plot(daily_cost_24['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_24['CUMULATIVE_TRANSACTION_COST'], linestyle='-', color='pink', linewidth=2) # Adjusted line width
plt.title(f'Cumulative Costs for 2024-{month:02d}')
plt.xlabel('Date')
plt.ylabel('Cumulative Daily Cost')
# Format x-ticks to show only the date (day and month)
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
plt.xticks(rotation=45) # Rotate x-ticks for better visibility
plt.grid()
plt.tight_layout()
plt.show()
补一张Top 5 FIN_SOURCE_TYPE_DESC for 2024 (和上面2022 一样)
以上都是national base的,下面需要一样的图for region和location,有问题可以看他的recording 13分钟的那个
df_cleaned.head()
| MONTH | YEAR | GL_TRANSACTION_EFFECTIVE_DT | GL_EFF_WD_NBR | FIN_SOURCE_TYPE_DESC | LOCATION_CD | LOCATION_DESC | BU_DESC | REGION_DESC | TRANSACTION_AMOUNT | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022-01-01 | 2022-01-01 | 2022-01-02 | 0 | INTERDISTRICT BILLING | 514 | TUPELO | OPS - MID SOUTH BU | OPS-SOUTH REGION | 52.0 |
| 1 | 2022-01-01 | 2022-01-01 | 2022-01-02 | 0 | INTERDISTRICT BILLING | 1554 | MANSFIELD | OPS - GREAT LAKES BU | OPS-NORTH REGION | 52.0 |
| 2 | 2022-01-01 | 2022-01-01 | 2022-01-02 | 0 | INTERDISTRICT BILLING | 30 | LANDOVER MD | OPS - DMV BU | OPS-NORTH REGION | 52.0 |
| 3 | 2022-01-01 | 2022-01-01 | 2022-01-02 | 0 | INTERDISTRICT BILLING | 284 | BRIDGETON MISSOURI | OPS - GREAT PLAINS BU | OPS-CENTRAL REGION | 208.0 |
| 4 | 2022-01-01 | 2022-01-01 | 2022-01-02 | 0 | INTERDISTRICT BILLING | 1278 | SANFORD NC | OPS - CAROLINAS BU | OPS-SOUTH REGION | 52.0 |
unique_regions = df_2024['REGION_DESC'].unique()
print(unique_regions)
['OPS-CENTRAL REGION ' 'OPS-SOUTH REGION ' 'OPS-NORTH REGION ' 'OPS-WEST REGION ']
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
plt.figure(figsize=(20, 15))
regions = ['OPS-CENTRAL REGION ', 'OPS-SOUTH REGION ', 'OPS-NORTH REGION ', 'OPS-WEST REGION ']
for month in range(1, 13):
plt.subplot(3, 4, month)
for region in regions:
month_data_22 = df_2022[(df_2022['MONTH'].dt.month == month) & (df_2022['REGION_DESC'] == region)]
daily_cost_22 = month_data_22.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
daily_cost_22['CUMULATIVE_TRANSACTION_COST'] = daily_cost_22['TRANSACTION_AMOUNT'].cumsum()
plt.plot(daily_cost_22['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_22['CUMULATIVE_TRANSACTION_COST'],
linestyle='-', label=region)
plt.title(f'Cumulative Costs for 2022-{month:02d}')
plt.xlabel('Date')
plt.ylabel('Cumulative Daily Cost')
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
plt.xticks(rotation=45)
plt.grid()
plt.legend()
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
plt.figure(figsize=(20, 15))
regions = ['OPS-CENTRAL REGION ', 'OPS-SOUTH REGION ', 'OPS-NORTH REGION ', 'OPS-WEST REGION ']
for month in range(1, 13):
plt.subplot(3, 4, month)
for region in regions:
month_data_23 = df_2023[(df_2023['MONTH'].dt.month == month) & (df_2023['REGION_DESC'] == region)]
daily_cost_23 = month_data_23.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
daily_cost_23['CUMULATIVE_TRANSACTION_COST'] = daily_cost_23['TRANSACTION_AMOUNT'].cumsum()
plt.plot(daily_cost_23['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_23['CUMULATIVE_TRANSACTION_COST'],
linestyle='-', label=region)
plt.title(f'Cumulative Costs for 2023-{month:02d}')
plt.xlabel('Date')
plt.ylabel('Cumulative Daily Cost')
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
plt.xticks(rotation=45)
plt.grid()
plt.legend()
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# Create a figure for 8 months
plt.figure(figsize=(20, 10))
regions = ['OPS-CENTRAL REGION ', 'OPS-SOUTH REGION ', 'OPS-NORTH REGION ', 'OPS-WEST REGION ']
for month in range(1, 9): #Only 1 to 8 month for 2024
plt.subplot(2, 4, month)
for region in regions:
month_data_24 = df_2024[(df_2024['MONTH'].dt.month == month) & (df_2024['REGION_DESC'] == region)]
daily_cost_24 = month_data_24.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
daily_cost_24['CUMULATIVE_TRANSACTION_COST'] = daily_cost_24['TRANSACTION_AMOUNT'].cumsum()
plt.plot(daily_cost_24['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_24['CUMULATIVE_TRANSACTION_COST'],
linestyle='-', label=region)
plt.title(f'Cumulative Costs for 2024-{month:02d}')
plt.xlabel('Date')
plt.ylabel('Cumulative Daily Cost')
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
plt.xticks(rotation=45)
plt.grid()
plt.legend()
plt.tight_layout()
plt.show()